{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "import pickle\n",
    "import jsonlines\n",
    "import tqdm\n",
    "import networkx as nx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load dict from file\n",
    "with open(\"/<YOUR_OWN_PATH>/ToolQA/data/external_corpus/dblp/title2id_dict.pkl\", \"rb\") as f:\n",
    "    title2id_dict = pickle.load(f)\n",
    "with open(\"/<YOUR_OWN_PATH>/ToolQA/data/external_corpus/dblp/author2id_dict.pkl\", \"rb\") as f:\n",
    "    author2id_dict = pickle.load(f)\n",
    "with open(\"/<YOUR_OWN_PATH>/ToolQA/data/external_corpus/dblp/id2title_dict.pkl\", \"rb\") as f:\n",
    "    id2title_dict = pickle.load(f)\n",
    "with open(\"/<YOUR_OWN_PATH>/ToolQA/data/external_corpus/dblp/id2author_dict.pkl\", \"rb\") as f:\n",
    "    id2author_dict = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the graphs from a file\n",
    "with open('/<YOUR_OWN_PATH>/ToolQA/data/external_corpus/dblp/paper_net.pkl', 'rb') as f:\n",
    "    paper_net = pickle.load(f)\n",
    "\n",
    "with open('/<YOUR_OWN_PATH>/ToolQA/data/external_corpus/dblp/author_net.pkl', 'rb') as f:\n",
    "    author_net = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_question_per_template = 10\n",
    "question_id = 0\n",
    "questions = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [00:00<00:00, 173.47it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-dblp-0000', 'question': 'How many people does Lili He need to know at least to know Zhikang Shuai in the DBLP citation network?', 'answer': '1'}, {'qid': 'hard-dblp-0001', 'question': 'How many people does Hiroshi Ishiguro need to know at least to know Manabu Ikeda in the DBLP citation network?', 'answer': '1'}, {'qid': 'hard-dblp-0002', 'question': 'How many people does Badawy Reham need to know at least to know Bloem Bastiaan in the DBLP citation network?', 'answer': '1'}, {'qid': 'hard-dblp-0003', 'question': 'How many people does Cagatay Goncu need to know at least to know Samuel Reinders in the DBLP citation network?', 'answer': '1'}, {'qid': 'hard-dblp-0004', 'question': 'How many people does Mark de Reuver need to know at least to know Jafar Rezaei in the DBLP citation network?', 'answer': '1'}, {'qid': 'hard-dblp-0005', 'question': 'How many people does Pratheepan Yogarajah need to know at least to know Sonya Coleman in the DBLP citation network?', 'answer': '1'}, {'qid': 'hard-dblp-0006', 'question': 'How many people does Jungyeul Park need to know at least to know KyungTae Lim in the DBLP citation network?', 'answer': '1'}, {'qid': 'hard-dblp-0007', 'question': 'How many people does Muhammad Akhtar Munir need to know at least to know Muhammad Haris Khan in the DBLP citation network?', 'answer': '1'}, {'qid': 'hard-dblp-0008', 'question': 'How many people does Raman Kumar need to know at least to know Vikram Goyal in the DBLP citation network?', 'answer': '1'}, {'qid': 'hard-dblp-0009', 'question': 'How many people does Aleksandra Revina need to know at least to know Nina Rizun in the DBLP citation network?', 'answer': '1'}]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# How many people does {author1} need to know at least to know {author2} in the DBLP citation network?\n",
    "for i in tqdm.tqdm(range(num_question_per_template)):\n",
    "    # randomly choose a paper\n",
    "    author_id1 = random.choice(list(id2author_dict.keys()))\n",
    "    name1 = id2author_dict[author_id1]\n",
    "    candidate_list = list(author_net.neighbors(author_id1))\n",
    "    while candidate_list == []:\n",
    "        author_id1 = random.choice(list(id2author_dict.keys()))\n",
    "        name1 = id2author_dict[author_id1]\n",
    "        candidate_list = list(author_net.neighbors(author_id1))\n",
    "    author_id2 = random.choice(candidate_list)\n",
    "    name2 = id2author_dict[author_id2]\n",
    "    question = \"How many people does {} need to know at least to know {} in the DBLP citation network?\".format(name1, name2)\n",
    "    answer = str(nx.shortest_path_length(author_net, author_id1, author_id2))\n",
    "    questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [00:00<00:00, 174.10it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-dblp-0010', 'question': 'How many common collaborators does Luigi Asprino have with Aldo Gangemi?', 'answer': '0'}, {'qid': 'hard-dblp-0011', 'question': 'How many common collaborators does Ananya Misra have with Thibault Doutre?', 'answer': '8'}, {'qid': 'hard-dblp-0012', 'question': 'How many common collaborators does Guoxian Yu have with Qiaoyu Tan?', 'answer': '3'}, {'qid': 'hard-dblp-0013', 'question': 'How many common collaborators does Jinhao Wu have with Jia, Riheng?', 'answer': '4'}, {'qid': 'hard-dblp-0014', 'question': 'How many common collaborators does Abdullah Mohammed Al-Majid have with Sammer Yousuf?', 'answer': '4'}, {'qid': 'hard-dblp-0015', 'question': 'How many common collaborators does Deli Zhao have with Jiapeng Zhu?', 'answer': '4'}, {'qid': 'hard-dblp-0016', 'question': 'How many common collaborators does Zhiwei Yu have with Shuang Chen?', 'answer': '4'}, {'qid': 'hard-dblp-0017', 'question': 'How many common collaborators does Ran Liu have with Xi Chen?', 'answer': '1'}, {'qid': 'hard-dblp-0018', 'question': 'How many common collaborators does Chen Jiang have with Xudong Yang?', 'answer': '10'}, {'qid': 'hard-dblp-0019', 'question': 'How many common collaborators does Jakub Montewka have with Spyros Hirdaris?', 'answer': '2'}]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# How many common collaborators does {author} have with {author}?\n",
    "for i in tqdm.tqdm(range(num_question_per_template)):\n",
    "    author_id1 = random.choice(list(id2author_dict.keys()))\n",
    "    name1 = id2author_dict[author_id1]\n",
    "    candidate_list = list(author_net.neighbors(author_id1))\n",
    "    while candidate_list == []:\n",
    "        author_id1 = random.choice(list(id2author_dict.keys()))\n",
    "        name1 = id2author_dict[author_id1]\n",
    "        candidate_list = list(author_net.neighbors(author_id1))\n",
    "    author_id2 = random.choice(candidate_list)\n",
    "    name2 = id2author_dict[author_id2]\n",
    "    question = \"How many common collaborators does {} have with {}?\".format(name1, name2)\n",
    "    answer = str(len(list(nx.common_neighbors(author_net, author_id1, author_id2))))\n",
    "    questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [00:00<00:00, 119.96it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-dblp-0020', 'question': 'Which is the most cited paper written by Andreas Vlachos in the DBLP citation network?', 'answer': 'FEVEROUS: Fact Extraction and VERification Over Unstructured and Structured information.'}, {'qid': 'hard-dblp-0021', 'question': 'Which is the most cited paper written by Yanshuai Cao in the DBLP citation network?', 'answer': 'Turing: An Accurate And Interpretable Multi-Hypothesis Cross-Domain Natural Language Database Interface'}, {'qid': 'hard-dblp-0022', 'question': 'Which is the most cited paper written by Taru Jain in the DBLP citation network?', 'answer': 'Multitask Learning for Emotionally Analyzing Sexual Abuse Disclosures'}, {'qid': 'hard-dblp-0023', 'question': 'Which is the most cited paper written by Guanyu Lin in the DBLP citation network?', 'answer': 'Dual Contrastive Network for Sequential Recommendation'}, {'qid': 'hard-dblp-0024', 'question': 'Which is the most cited paper written by Yashoteja Prabhu in the DBLP citation network?', 'answer': 'Generalized Zero-Shot Extreme Multi-label Learning'}, {'qid': 'hard-dblp-0025', 'question': 'Which is the most cited paper written by Jiefeng Hu in the DBLP citation network?', 'answer': 'Adaptive Droop Control Using Adaptive Virtual Impedance for Microgrids with Variable PV Outputs and Load Demands'}, {'qid': 'hard-dblp-0026', 'question': 'Which is the most cited paper written by Sufeng Niu in the DBLP citation network?', 'answer': 'Performance-Adaptive Sampling Strategy Towards Fast and Accurate Graph Neural Networks'}, {'qid': 'hard-dblp-0027', 'question': 'Which is the most cited paper written by Łukasz Orlikowski in the DBLP citation network?', 'answer': 'Reachability in Vector Addition Systems is Ackermann-complete'}, {'qid': 'hard-dblp-0028', 'question': 'Which is the most cited paper written by Toru Akishita in the DBLP citation network?', 'answer': 'Adversarial Attacks for Tabular Data: Application to Fraud Detection and  Imbalanced Data'}, {'qid': 'hard-dblp-0029', 'question': 'Which is the most cited paper written by Wei Tong in the DBLP citation network?', 'answer': 'Practical Location Privacy Attacks and Defense on Point-of-interest Aggregates'}]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Which is the most cited paper written by {author} in the DBLP citation network?\n",
    "for i in tqdm.tqdm(range(num_question_per_template)):\n",
    "    answer = \"\"\n",
    "    while answer == \"\":\n",
    "        author_id = random.choice(list(id2author_dict.keys()))\n",
    "        name = id2author_dict[author_id]\n",
    "        question = \"Which is the most cited paper written by {} in the DBLP citation network?\".format(name)\n",
    "        max_citation = 0\n",
    "        for neighbour in author_net.neighbors(author_id):\n",
    "            for paper in author_net[author_id][neighbour][\"papers\"]:\n",
    "                if paper_net.nodes[paper][\"n_citation\"] > max_citation:\n",
    "                    max_citation = paper_net.nodes[paper][\"n_citation\"]\n",
    "                    answer = paper_net.nodes[paper][\"title\"]\n",
    "    questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-dblp-0039', 'question': 'Which collaborator does S. Thomas George have the most citations with in the DBLP citation network?', 'answer': 'Sanjukta Rani Jena'}\n"
     ]
    }
   ],
   "source": [
    "# Which collaborator does {author} have the most cited paper with in the DBLP citation network?\n",
    "for i in range(num_question_per_template):\n",
    "    answer = \"\"\n",
    "    while answer == \"\":\n",
    "        author_id = random.choice(list(id2author_dict.keys()))\n",
    "        name = id2author_dict[author_id]\n",
    "        question = \"Which collaborator does {} have the most citations with in the DBLP citation network?\".format(name)\n",
    "        \n",
    "        max_citation = 0\n",
    "        for neighbour in author_net.neighbors(author_id):\n",
    "            citation = 0\n",
    "            for paper in author_net[author_id][neighbour][\"papers\"]:\n",
    "                citation += paper_net.nodes[paper][\"n_citation\"]\n",
    "            if citation > max_citation:\n",
    "                max_citation = citation\n",
    "                answer = id2author_dict[neighbour]\n",
    "    questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-dblp-0049', 'question': 'Which venue does Chenguang He publish the most papers in the DBLP citation network?', 'answer': 'IWCMC 2021: 2021 17TH INTERNATIONAL WIRELESS COMMUNICATIONS & MOBILE COMPUTING CONFERENCE (IWCMC),IWCMC 2021: 2021 17TH INTERNATIONAL WIRELESS COMMUNICATIONS & MOBILE COMPUTING CONFERENCE (IWCMC),Mobile Networks and Applications'}\n"
     ]
    }
   ],
   "source": [
    "# Which venue does {author} publish the most papers in the DBLP citation network?\n",
    "for i in range(num_question_per_template):\n",
    "    answer = \"\"\n",
    "    while answer == \"\":\n",
    "        author_id = random.choice(list(id2author_dict.keys()))\n",
    "        name = id2author_dict[author_id]\n",
    "        question = \"Which venue does {} publish the most papers in the DBLP citation network?\".format(name)\n",
    "        \n",
    "        max_paper = 0\n",
    "        for neighbour in author_net.neighbors(author_id):\n",
    "            paper_num = 0\n",
    "            for paper in author_net[author_id][neighbour][\"papers\"]:\n",
    "                paper_num += 1\n",
    "            if paper_num > max_paper:\n",
    "                max_paper = paper_num\n",
    "                answer = [paper_net.nodes[paper][\"venue\"][\"raw\"]]\n",
    "            else:\n",
    "                answer.append(paper_net.nodes[paper][\"venue\"][\"raw\"])\n",
    "        answer = \",\".join(answer)\n",
    "        questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "        question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-dblp-0059', 'question': 'How many accumulated citations do papers collaborated by Chathura Wanigasekara and Akshya Swain have in the DBLP citation network?', 'answer': '1'}\n"
     ]
    }
   ],
   "source": [
    "# How many accumulated citations do papers collaborated by {author1} and {author2} have in the DBLP citation network?\n",
    "for i in range(num_question_per_template):\n",
    "    author_id1 = random.choice(list(id2author_dict.keys()))\n",
    "    name1 = id2author_dict[author_id1]\n",
    "    candidate_list = list(author_net.neighbors(author_id1))\n",
    "    while candidate_list == []:\n",
    "        author_id1 = random.choice(list(id2author_dict.keys()))\n",
    "        name1 = id2author_dict[author_id1]\n",
    "        candidate_list = list(author_net.neighbors(author_id1))\n",
    "    author_id2 = random.choice(candidate_list)\n",
    "    name2 = id2author_dict[author_id2]\n",
    "    question = \"How many accumulated citations do papers collaborated by {} and {} have in the DBLP citation network?\".format(name1, name2)\n",
    "    answer = 0\n",
    "    for paper in author_net[author_id1][author_id2][\"papers\"]:\n",
    "        answer += paper_net.nodes[paper][\"n_citation\"]\n",
    "    answer = str(answer)\n",
    "    questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "How many papers in all do Han Wu and his/her collaborators have in the DBLP citation network? 100\n",
      "How many papers in all do Guang Cheng and his/her collaborators have in the DBLP citation network? 33\n",
      "How many papers in all do Jiayu Dong and his/her collaborators have in the DBLP citation network? 10\n",
      "How many papers in all do Duc-Dung Tran and his/her collaborators have in the DBLP citation network? 593\n",
      "How many papers in all do Atsushi Keyaki and his/her collaborators have in the DBLP citation network? 63\n",
      "How many papers in all do Pietro Ferraro and his/her collaborators have in the DBLP citation network? 0\n",
      "How many papers in all do Chejian Xu and his/her collaborators have in the DBLP citation network? 2133\n",
      "How many papers in all do Irvin Aloise and his/her collaborators have in the DBLP citation network? 348\n",
      "How many papers in all do Hongping Wang and his/her collaborators have in the DBLP citation network? 103\n",
      "How many papers in all do Claire O'Shea and his/her collaborators have in the DBLP citation network? 315\n",
      "{'qid': 'hard-dblp-0069', 'question': \"How many papers in all do Claire O'Shea and his/her collaborators have in the DBLP citation network?\", 'answer': 315}\n"
     ]
    }
   ],
   "source": [
    "# How many papers in all do {author} and his/her collaborators have in the DBLP citation network?\n",
    "for i in range(num_question_per_template):\n",
    "    answer = 720000\n",
    "    while answer > 500000:\n",
    "        author_id = random.choice(list(id2author_dict.keys()))\n",
    "        name = id2author_dict[author_id]\n",
    "        question = \"How many papers in all do {} and his/her collaborators have in the DBLP citation network?\".format(name)\n",
    "        answer = []\n",
    "        answer.append(author_id)\n",
    "        for neighbour in author_net.neighbors(author_id):\n",
    "            # answer += len(author_net[author_id][neighbour][\"papers\"])\n",
    "            answer.append(neighbour)\n",
    "            for neighbour2 in author_net.neighbors(neighbour):\n",
    "                if neighbour2 != author_id:\n",
    "                    # answer += len(author_net[neighbour][neighbour2][\"papers\"])\n",
    "                    answer.append(neighbour2)\n",
    "        answer = set(answer)\n",
    "        subgraph = author_net.subgraph(answer)\n",
    "        answer = subgraph.number_of_edges()\n",
    "    print(question, answer)\n",
    "    questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [00:00<00:00, 179.76it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-dblp-0079', 'question': 'Who collaborated with Vincent Lepetit most in the DBLP citation network?', 'answer': 'Sinisa Stekovic, Friedrich Fraundorfer, Mahdi Rad'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Who collarborated with {author} most in the DBLP citation network?\n",
    "for i in tqdm.tqdm(range(num_question_per_template)):\n",
    "    # randomly choose a paper\n",
    "    author_id = random.choice(list(id2author_dict.keys()))\n",
    "    name = id2author_dict[author_id]\n",
    "    question = \"Who collaborated with {} most in the DBLP citation network?\".format(name)\n",
    "    max_weight = 0\n",
    "    max_author = []\n",
    "    for neighbor in author_net.neighbors(author_id):\n",
    "        if author_net[author_id][neighbor][\"weight\"] > max_weight:\n",
    "            max_weight = author_net[author_id][neighbor][\"weight\"]\n",
    "            max_author = [id2author_dict[neighbor]]\n",
    "        elif author_net[author_id][neighbor][\"weight\"] == max_weight:\n",
    "            max_author.append(id2author_dict[neighbor])\n",
    "    answer = ', '.join(max_author)\n",
    "    questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'qid': 'hard-dblp-0089', 'question': 'What institutions participated in the study of Wearable Sensor-Based Prediction Model of Timed up and Go Test in Older Adults in the DBLP citation network?', 'answer': 'Univ Nebraska Omaha, Coll Informat Sci & Technol, Omaha, NE 68182 USA; Univ Nebraska Med Ctr, Dept Biostat, Omaha, NE 68198 USA; Univ Nebraska Omaha, Dept Biomech, Omaha, NE 68182 USA'}\n"
     ]
    }
   ],
   "source": [
    "# What institutions participated in the study of {title} in the DBLP citation network?\n",
    "for i in range(num_question_per_template):\n",
    "    # randomly choose a paper\n",
    "    paper_id = random.choice(list(id2title_dict.keys()))\n",
    "    title = id2title_dict[paper_id]\n",
    "    question = \"What institutions participated in the study of {} in the DBLP citation network?\".format(title)\n",
    "    answer = []\n",
    "    authors = paper_net.nodes[paper_id][\"authors\"]\n",
    "    for author in authors:\n",
    "        answer.append(author[\"org\"])\n",
    "    answer = \"; \".join(list(set(answer)))\n",
    "    questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "print(questions[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'hard-dblp-0090', 'question': 'What keywords does Shubo Liu focus on most in the DBLP citation network?', 'answer': 'Image-to-image translation,  Discriminator,  Generative adversarial networks,  Attribute label,  Multi-domain,  Unpaired dataset,  Unsupervised'}, {'qid': 'hard-dblp-0091', 'question': 'What keywords does Istvan Haller focus on most in the DBLP citation network?', 'answer': 'security-in-depth, secret-free, speculative vulnerabilities, hypervisor security'}, {'qid': 'hard-dblp-0092', 'question': 'What keywords does S. A. Abramov focus on most in the DBLP citation network?', 'answer': '39A06, 68W30, 12H10'}, {'qid': 'hard-dblp-0093', 'question': 'What keywords does Lorenz Räber focus on most in the DBLP citation network?', 'answer': 'Vessel, IVUS, Gaussian process, Lumen, Segmentation, Deep learning'}, {'qid': 'hard-dblp-0094', 'question': 'What keywords does Suvi K. Holm focus on most in the DBLP citation network?', 'answer': 'Games, Focusing, Conferences'}, {'qid': 'hard-dblp-0095', 'question': 'What keywords does Zixuan Yi focus on most in the DBLP citation network?', 'answer': ' Self-supervised Learning,  Graph Neural Network, Multi-modal'}, {'qid': 'hard-dblp-0096', 'question': 'What keywords does Zhuo-Xu Cui focus on most in the DBLP citation network?', 'answer': 'Transforms, Supervised learning, Training,  low-rank, Image reconstruction, Learning systems, Magnetic resonance imaging,  Riemannian optimization,  deep learning, Sparse matrices, Data models,  manifold learning, Deep learning, Dynamic MR imaging, Imaging'}, {'qid': 'hard-dblp-0097', 'question': 'What keywords does Guillermo Barturen focus on most in the DBLP citation network?', 'answer': ' Curation, computational models,  Interferon signature,  Epigenomics,  Dataset, personalized medicine, Autoimmune disease, autoimmune diseases,  GEO,  Gene expression, molecular profiling,  Database,  Transcriptomics,  Meta-analysis, Systemic Lupus Erythematosus'}, {'qid': 'hard-dblp-0098', 'question': 'What keywords does Sabine E M J Stoltz focus on most in the DBLP citation network?', 'answer': ' violent video games,  longitudinal, aggression,  adolescence,  social networks'}, {'qid': 'hard-dblp-0099', 'question': 'What keywords does Yuanzhe Wang focus on most in the DBLP citation network?', 'answer': 'Robot sensing systems,  sensor attack, Cyber-security, Mobile robots,  anomaly detection, Service robots, Uncertainty, Three-dimensional displays, Robots, Planning, Collaboration, Kalman filters,  autonomous vehicle, Robot kinematics,  cyber-attack, Pose estimation, Navigation, Geometry, Semantics'}]\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "# What keywords does Blanca Rodriguez focus on most in the DBLP citation network?\n",
    "for i in range(num_question_per_template):\n",
    "    answer = \"\"\n",
    "    while answer == \"\":\n",
    "        # randomly choose a paper\n",
    "        author_id = random.choice(list(id2author_dict.keys()))\n",
    "        name = id2author_dict[author_id]\n",
    "        question = \"What keywords does {} focus on most in the DBLP citation network?\".format(name)\n",
    "        keywords = []\n",
    "        for neighbor_id in author_net.neighbors(author_id):\n",
    "            for paper in author_net[author_id][neighbor_id][\"papers\"]:\n",
    "                keywords.extend(paper_net.nodes[paper][\"keywords\"])\n",
    "        keywords = ', '.join(list(set(keywords)))\n",
    "        answer = keywords\n",
    "    questions.append({\"qid\": \"hard-dblp-{:0>4d}\".format(question_id), \"question\": question, \"answer\": answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "with jsonlines.open('/<YOUR_OWN_PATH>/ToolQA/data/questions/hard/dblp-hard.jsonl', 'w') as writer:\n",
    "    for row in questions:\n",
    "        writer.write(row)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
